library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ stringr 1.4.0
## ✓ tidyr 1.1.2 ✓ forcats 0.5.0
## ✓ readr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(fastDummies)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
data<-read.table("base_trabajo_segmentacion.csv" ,sep = ";",header = TRUE)
data<-na.omit(data)
#Modificando las variables
datos_reducidos=mutate(data, en_vm_otros_ = en_vm_canal6+en_vm_canal7+en_vm_canal8+en_vm_canal9+en_vm_canal10+en_vm_otros,en_tx_otros_=en_tx_canal6+en_tx_canal7+en_tx_canal8+en_tx_canal9+en_tx_canal10+en_tx_otros)
borrar <- c("nit","en_vm_canal6","en_vm_canal7","en_vm_canal8","en_vm_canal9","en_vm_canal10","en_vm_otros","en_tx_canal6","en_tx_canal7","en_tx_canal8","en_tx_canal9","en_tx_canal10","en_tx_otros")
datos2 <- datos_reducidos[ , !(names(datos_reducidos) %in% borrar)]#En este paso se eliminan las variables sumadas ateriormente
datos_categoricos_seleccionados=datos2[19:34]#Se seleccionan y almacenan los datos categóricos
datos_categoricos=c("impo_cv","expo_vt","cxp","cxc","totalinventory","pagos_pj","pagos_pn","tiene_ventas_fisicas","tiene_ventas_electronicas","recaudos_pj","recaudos_pn","rotacion_inventarios","rotacion_cxc","rotacion_cxp","ciclo_negocio","ciclo_financiero")
datos_numericos <- datos2[ , !(names(datos2) %in% datos_categoricos)]#Para hacer los primeros análisi se dejan únicamente las variables cuantitativas del modelo
Sigma_t<-cov(scale(datos_numericos,center=T,scale=T))
descomp_espectr_t<-eigen(Sigma_t)
lambdas_t<-descomp_espectr_t$values
D_t<-descomp_espectr_t$vectors
#En caso de ser necesario, se hará un análisis del ACP
acprincipales=prcomp(datos_numericos,scale=T)
acprincipales
## Standard deviations (1, .., p=20):
## [1] 2.50011551 1.58117656 1.44141911 1.23849358 1.08063358 1.04720979
## [7] 1.01300694 0.96792533 0.86610611 0.79281554 0.73493703 0.62590255
## [13] 0.62156148 0.56804575 0.43800337 0.32844986 0.22978995 0.18026508
## [19] 0.07335472 0.01626347
##
## Rotation (n x k) = (20 x 20):
## PC1 PC2 PC3 PC4 PC5
## en_vm_canal1 0.196111232 0.298677762 -0.01758397 -0.070098205 0.38572320
## en_vm_canal2 0.269752007 0.142552018 -0.05219006 -0.006563866 0.22423646
## en_vm_canal3 0.201450180 0.387128867 -0.14771157 0.046934354 -0.16864279
## en_vm_canal4 0.297846107 -0.366537491 -0.04855269 0.034662152 -0.03653591
## en_vm_canal5 0.355720357 -0.135357501 -0.04741409 0.023337027 -0.14771284
## en_tx_canal1 0.161277692 0.336792134 -0.10955874 0.035322237 0.12588672
## en_tx_canal2 0.199525929 0.080423283 0.12993596 -0.116914463 0.51261720
## en_tx_canal3 0.153102310 0.360694034 -0.16532312 0.094299212 -0.44513141
## en_tx_canal4 0.293207788 -0.385478742 -0.04872198 0.039279168 -0.07593586
## en_tx_canal5 0.130764803 0.084102588 0.55763055 -0.177644631 -0.24835955
## sal_vm_canal5 0.355549688 -0.133621125 -0.04662081 0.022506824 -0.14639603
## sal_vm_canal2 0.366717606 0.048729711 -0.08218690 0.015839151 0.14993884
## sal_vm_canal8 0.002343343 0.006372857 0.01941851 -0.034607005 0.15363050
## sal_vm_otros 0.007285194 0.023670168 0.24532151 0.657392366 0.09991336
## sal_tx_canal5 0.127322078 0.091094294 0.59295464 -0.226633702 -0.20373390
## sal_tx_canal2 0.218138109 0.068249988 0.14034123 -0.048434239 0.03712224
## sal_tx_canal8 0.021668280 0.053766505 0.23075955 -0.201736318 0.15099951
## sal_tx_otros 0.011075602 0.024640324 0.29602421 0.629795232 0.07221945
## en_vm_otros_ 0.322299875 -0.224979617 -0.05906251 0.034785163 -0.01017554
## en_tx_otros_ 0.106727001 0.307190677 -0.12590279 0.088355427 -0.23795516
## PC6 PC7 PC8 PC9 PC10
## en_vm_canal1 0.38784352 -0.158015511 0.031190032 -0.244176757 -0.105757642
## en_vm_canal2 -0.14981641 -0.034504242 -0.002941420 0.206077140 0.225225036
## en_vm_canal3 0.41902490 -0.054678055 0.035633956 0.031051248 -0.006366434
## en_vm_canal4 -0.04879406 0.065205860 -0.061863365 -0.020648199 -0.161372706
## en_vm_canal5 0.18396294 -0.007619506 0.010779230 0.022846372 0.044592155
## en_tx_canal1 -0.36749841 0.058982604 -0.057909195 -0.496657067 0.316200015
## en_tx_canal2 -0.12587425 -0.108488905 0.041523555 0.321779379 -0.572382288
## en_tx_canal3 0.10933945 0.114410260 -0.034024746 0.362023347 -0.112299104
## en_tx_canal4 -0.04053301 0.077059198 -0.068586851 -0.058113415 -0.116324713
## en_tx_canal5 -0.04326276 -0.144373112 0.230030724 -0.170923173 -0.085522905
## sal_vm_canal5 0.18669821 -0.006814179 0.008649184 0.021112499 0.046441524
## sal_vm_canal2 -0.00644150 -0.045821833 0.012428190 -0.142945481 0.128301143
## sal_vm_canal8 0.11712594 0.781877402 0.590616547 -0.017626434 0.010102318
## sal_vm_otros 0.04095832 0.041137852 -0.059333702 0.014159818 -0.005855127
## sal_tx_canal5 0.01367655 0.018591611 -0.016655149 -0.129402957 -0.009948242
## sal_tx_canal2 -0.33799442 -0.070377237 0.164463846 0.529822797 0.444451488
## sal_tx_canal8 0.16692480 0.481973381 -0.727225701 0.121506884 0.138136008
## sal_tx_otros 0.04169115 0.017618524 -0.021902401 0.008097712 0.014567410
## en_vm_otros_ -0.09152651 0.062152440 -0.041063840 -0.189603343 0.018701110
## en_tx_otros_ -0.49624393 0.233409173 -0.137932019 -0.111838455 -0.461437218
## PC11 PC12 PC13 PC14 PC15
## en_vm_canal1 0.244983519 -0.076868729 0.227392762 -0.253507452 0.259041920
## en_vm_canal2 -0.733779203 -0.021298080 0.052792305 -0.354801761 0.162317314
## en_vm_canal3 0.181178574 0.066069076 -0.210840554 -0.046606806 0.159757171
## en_vm_canal4 0.040729356 0.025725848 -0.054347350 0.069695839 0.516078313
## en_vm_canal5 -0.049859626 -0.133249517 0.391517763 0.174090569 -0.285367875
## en_tx_canal1 -0.015861029 0.013137148 -0.005153170 0.561993318 0.127307880
## en_tx_canal2 -0.013133910 0.053841274 -0.112440903 0.371155474 -0.160374817
## en_tx_canal3 -0.155617204 0.100723364 -0.258779315 0.240420692 0.073983600
## en_tx_canal4 0.013297342 0.012600530 -0.024225234 0.032751845 0.414029786
## en_tx_canal5 -0.097872306 -0.019847792 -0.056238204 -0.020676874 0.039606046
## sal_vm_canal5 -0.047639635 -0.134561698 0.395587850 0.175561794 -0.285611190
## sal_vm_canal2 -0.003463341 0.087481855 -0.253303179 -0.178270558 -0.311099119
## sal_vm_canal8 -0.017523663 -0.003077326 0.005043460 0.001355643 0.002521448
## sal_vm_otros 0.001434063 -0.663600744 -0.222051286 0.014546280 -0.003942049
## sal_tx_canal5 -0.069044365 -0.014526828 -0.050418801 -0.019840603 0.031182920
## sal_tx_canal2 0.520178011 -0.049538124 0.090962104 -0.070959912 0.109624164
## sal_tx_canal8 0.061178064 0.014495336 0.008000959 0.007213890 -0.029435993
## sal_tx_otros 0.003743195 0.672295752 0.234301002 -0.007734474 -0.001142211
## en_vm_otros_ 0.166777069 0.165464179 -0.485462477 -0.236533354 -0.340826480
## en_tx_otros_ 0.143769655 -0.093356080 0.299984533 -0.367848658 -0.066839370
## PC16 PC17 PC18 PC19
## en_vm_canal1 -0.4554747473 0.1060774569 -0.0571497707 1.312923e-02
## en_vm_canal2 0.0973312044 0.1517185393 -0.1030954239 8.996487e-04
## en_vm_canal3 0.6778960289 0.0843106002 -0.0113631201 -1.095461e-02
## en_vm_canal4 0.0368471524 -0.4495182149 -0.5048161416 -1.958514e-02
## en_vm_canal5 0.0267974728 0.0396445355 -0.0643985751 -3.467268e-02
## en_tx_canal1 0.0235883333 0.0883747263 -0.0451777472 -4.487990e-03
## en_tx_canal2 0.1241789037 0.0680772535 0.0132591961 2.166125e-02
## en_tx_canal3 -0.5263892043 -0.0145532391 0.0160122640 7.560946e-03
## en_tx_canal4 0.0003809804 0.3272617773 0.6675951386 1.892525e-02
## en_tx_canal5 -0.0160306206 0.0047850827 0.0151394914 -6.655112e-01
## sal_vm_canal5 0.0293774798 0.0410383414 -0.0673642077 2.898858e-02
## sal_vm_canal2 -0.0483780641 -0.6540193087 0.4049367122 -3.842812e-03
## sal_vm_canal8 0.0080772555 -0.0072415633 0.0037986875 1.770073e-03
## sal_vm_otros -0.0053432403 0.0001224729 -0.0009743556 -2.370944e-06
## sal_tx_canal5 0.0142207103 -0.0275674395 -0.0109960727 7.087737e-01
## sal_tx_canal2 -0.0261467939 0.0275080847 0.0049916138 -3.990052e-03
## sal_tx_canal8 0.0248266151 -0.0143279827 0.0119539773 -2.259741e-01
## sal_tx_otros 0.0038072093 0.0026770629 -0.0008674446 -1.120593e-04
## en_vm_otros_ -0.1216932174 0.4506043292 -0.3311267779 1.783503e-03
## en_tx_otros_ 0.0922375963 -0.0308490189 0.0119341021 3.055133e-03
## PC20
## en_vm_canal1 -2.296608e-03
## en_vm_canal2 7.314391e-04
## en_vm_canal3 -1.254960e-03
## en_vm_canal4 1.049905e-03
## en_vm_canal5 -7.078796e-01
## en_tx_canal1 -1.396618e-03
## en_tx_canal2 -4.436135e-04
## en_tx_canal3 1.915983e-03
## en_tx_canal4 9.653443e-04
## en_tx_canal5 3.051265e-02
## sal_vm_canal5 7.048763e-01
## sal_vm_canal2 2.319367e-03
## sal_vm_canal8 -6.905418e-05
## sal_vm_otros -2.696364e-05
## sal_tx_canal5 -3.227495e-02
## sal_tx_canal2 -1.027991e-05
## sal_tx_canal8 7.879954e-03
## sal_tx_otros -8.026732e-05
## en_vm_otros_ 6.812733e-04
## en_tx_otros_ 1.037106e-05
#Qué % de variablidad es explicada para cada componente:
prop_varianza <- acprincipales$sdev^2 / sum(acprincipales$sdev^2)
prop_varianza*100
## [1] 31.252887750 12.500596548 10.388445237 7.669331760 5.838844620
## [6] 5.483241744 5.130915334 4.684397232 3.750698974 3.142782372
## [11] 2.700662212 1.958770004 1.931693356 1.613379883 0.959234753
## [16] 0.539396539 0.264017105 0.162477498 0.026904577 0.001322502
ggplot(data = data.frame(prop_varianza, pc = 1:20),
aes(x = pc, y = prop_varianza)) +
geom_col(width = 0.3) +
scale_y_continuous(limits = c(0,1)) +
theme_bw() +
labs(x = "Componente principal",
y = "Prop. de varianza explicada")
prop_varianza_acum <- cumsum(prop_varianza)
prop_varianza_acum
## [1] 0.3125289 0.4375348 0.5414193 0.6181126 0.6765011 0.7313335 0.7826426
## [8] 0.8294866 0.8669936 0.8984214 0.9254280 0.9450157 0.9643327 0.9804665
## [15] 0.9900588 0.9954528 0.9980930 0.9997177 0.9999868 1.0000000
ggplot(data = data.frame(prop_varianza_acum, pc = 1:20),
aes(x = pc, y = prop_varianza_acum, group = 1)) +
geom_point() +
geom_line() +
theme_bw() +
labs(x = "Componente principal",
y = "Prop. varianza explicada acumulada")
#Con las 9 primeras compoentes principales se explica el 87% de la variabilidad total
#Datos proyectados con las 9 primeras componentes principales
datos_proyectados=acprincipales$x#Acá está con todas sus componetes principales
datos_proyectados_reducidos=datos_proyectados[,1:9]
#Los vectores propios son los siguientes:
vec_propios=acprincipales$rotation
val_propios=lambdas_t
#Tomaremos únicamente los necesarios
D_red_t<-vec_propios[,1:9]
lambdas_red_t<-val_propios[1:9]
#Se recontruye la matriz de covarianzas escalada
Sigma_rec_t<-D_red_t%*%diag(lambdas_red_t)%*%t(D_red_t)
datos_reconstruidos_esc_cent_t<-datos_proyectados_reducidos%*%t(D_red_t)
#Ahora, se hace el clustering con las 9 compnentes principales
#Recordemos que éstas están almacenadas en la varianle datos_proyectados_reducidos
set.seed(1234)
wcss <- vector()
for(i in 1:20){
wcss[i] <- sum(kmeans(datos_proyectados_reducidos, i)$withinss)
}
#Se verifica el número de centroides óptimo
ggplot() + geom_point(aes(x = 1:20, y = wcss), color = 'blue') +
geom_line(aes(x = 1:20, y = wcss), color = 'blue') +
ggtitle("Método del Codo") +
xlab('Cantidad de Centroides k') +
ylab('WCSS')
#Enotro gráfico más bonito
set.seed(3) # Se fija la semilla para obtener resultados reproducibles
# Cálculo de los grupos
centers <- 2:10 # este es el valor de K
resultados <- vector(mode="list",length = 10) # en esta lista se almacenan los resultados de cada agrupamiento
for (i in 1:length(centers)){
resultados[[i]] <- kmeans(x=datos_proyectados_reducidos,centers=centers[i],nstart = 3)
}
# Extracción de la métrica de desempeño ("withinss") para cada K:
metrica_cl <- do.call("rbind",lapply(resultados,"[[",5))
num_centros <- 2:10
res_num_cen <- data.frame(num_centros,metrica_cl)
grph_metrica_cl <- ggplot(res_num_cen,aes(x=num_centros,xend=num_centros,y=0,yend=metrica_cl))
grph_metrica_cl + geom_point(aes(x=num_centros,y=metrica_cl)) + geom_segment() + theme_bw() + labs(title = "Desempeño del agrupamiento \n en función de K",
x = "K (cantidad de centros)",
y = "Métrica de desempeño")
#Por ahora se seleccionan 5 grupos
set.seed(123)
for (i in 2:8){wines_K2 <- kmeans(datos_proyectados_reducidos, centers =i , nstart = 25)
print(wines_K2$size)
}
## [1] 6 2227
## [1] 24 2208 1
## [1] 2180 6 46 1
## [1] 6 1 2171 53 2
## [1] 52 2 1 1 6 2171
## [1] 6 203 2003 2 1 17 1
## [1] 1 2004 1 11 6 6 202 2
set.seed(123)
kmenas=kmeans(datos_proyectados_reducidos, centers =5 , nstart = 25)
fviz_cluster(kmenas, data = datos_numericos)
acprincipales$scale
## en_vm_canal1 en_vm_canal2 en_vm_canal3 en_vm_canal4 en_vm_canal5
## 8.628981e+09 2.420627e+10 1.897338e+09 9.285151e+09 2.270161e+10
## en_tx_canal1 en_tx_canal2 en_tx_canal3 en_tx_canal4 en_tx_canal5
## 7.015267e+03 2.792873e+02 1.222524e+04 7.483593e+04 1.865614e+01
## sal_vm_canal5 sal_vm_canal2 sal_vm_canal8 sal_vm_otros sal_tx_canal5
## 2.276978e+10 3.196322e+10 5.087747e+08 1.648173e+07 1.979337e+01
## sal_tx_canal2 sal_tx_canal8 sal_tx_otros en_vm_otros_ en_tx_otros_
## 2.764380e+03 9.716711e+01 1.360357e+01 2.055489e+10 1.187329e+04
biplot(x=acprincipales,scale = 0)
dfPCA <- as.data.frame(acprincipales$x)
dfPCA <- cbind(dfPCA, kmenas$cluster )
dfPCA$`kmenas$cluster` <- as.factor(dfPCA$`kmenas$cluster`)
dfPCA$`kmenas$cluster` <- fct_recode(dfPCA$`kmenas$cluster`, "Grupo 1" = "1",
"Grupo 2" = "2",
"Grupo 3" = "3",
"Grupo 4" = "4",
"Grupo 5" = "5" )
plotly::plot_ly(dfPCA, x=~PC1, y=~PC2, z=~PC3, color=dfPCA$`kmenas$cluster` )
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode